Setup Code:

Problem 1 (Jeremy):

total_males <- ge %>% 
  select(year, total_males) %>% 
  mutate(total = total_males) %>% # 
  mutate(sex = rep("male", length(total_males))) %>% 
  select(year, total, sex)
total_females <- ge %>% 
  select(year, total_females) %>% 
  mutate(total = total_females) %>% 
  mutate(sex = rep("female", length(total_females))) %>% 
  select(year, total, sex)
total <- rbind(total_males, total_females)

total_enroll_plot <- ggplot(total, aes(x = year, y = total, shape = sex, color = sex)) +
  geom_point(size = 2, alpha = 0.5) +
  geom_smooth(method = "lm") +
  theme_classic() +
  theme(panel.border = element_rect(colour = "black", fill = NA, size = 0.7)) +
  labs(x = "Year", y = "Total Male & Female Graduate Student \nEnrollment in the U.S.") +
  scale_y_continuous() +
  scale_color_manual(values = c("grey11", "gray48")) +
  annotate("text", 
           x = 2010, 
           y = 940000, 
           label = "y = 9.42e-05x + 1.9e03", 
           family = "Times New Roman", 
           size = 3) +
  annotate("text", 
           x = 2010, 
           y = 880000, 
           label = "italic(R)^2 == 0.85", 
           parse = TRUE, 
           family = "Times New Roman", 
           size = 3) +
  annotate("text", 
           x = 1999, 
           y = 1600000, 
           label = "y = 3.26e-05x + 1.96e03", 
           family = "Times New Roman", 
           size = 3) +
  annotate("text", 
           x = 1999, 
           y = 1550000, 
           label = "italic(R)^2 == 0.98", 
           parse = TRUE, 
           family = "Times New Roman", 
           size = 3) 
total_enroll_plot

# Figure 1. Relationship between year and graduate student enrollment for both males and females (1967-2015). Year significantly predicts graduate student enrollment in the U.S. for both males (*b* = 9.42e-05, t(47) = 16.61, *p* <0.001) and females (*b* = 3.26e-05, t(47) = 51.66, *p* < 0.001) with a strong positive correlation between year and both sexes (Pearson's *r* (males) = 0.92, Pearson's *r* (females) =  0.99). The overall models (male enrollment = 9.42e-05(year) + 1.9e03, female enrollment = 3.26e-05(year) + 1.96e03) explain a significant amount of variance in salary for males (F(1,47) = 276, *p* < 0.001), R^2^ = 0.85 and females (F(1.9,47) = 2669, *p* < 0.001, R^2^ = 0.98). Gray region is the 95% confidence interval for the mean predicted values.

# lm for males
enrollment_males_lm <- lm(year ~ total, data = total_males)
summary(enrollment_males_lm)
## 
## Call:
## lm(formula = year ~ total, data = total_males)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -11.206  -4.268   1.297   4.049   9.309 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.902e+03  5.416e+00  351.17   <2e-16 ***
## total       9.422e-05  5.672e-06   16.61   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.509 on 47 degrees of freedom
## Multiple R-squared:  0.8545, Adjusted R-squared:  0.8514 
## F-statistic:   276 on 1 and 47 DF,  p-value: < 2.2e-16
cor.test(total_males$total, total_males$year) 
## 
##  Pearson's product-moment correlation
## 
## data:  total_males$total and total_males$year
## t = 16.612, df = 47, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8690777 0.9568547
## sample estimates:
##       cor 
## 0.9243741
# lm for females
enrollment_females_lm <- lm(year ~ total, data = total_females)
summary(enrollment_females_lm)
## 
## Call:
## lm(formula = year ~ total, data = total_females)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9028 -1.1964  0.0177  1.5362  3.0372 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.958e+03  7.021e-01 2788.29   <2e-16 ***
## total       3.262e-05  6.314e-07   51.66   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.9 on 47 degrees of freedom
## Multiple R-squared:  0.9827, Adjusted R-squared:  0.9823 
## F-statistic:  2669 on 1 and 47 DF,  p-value: < 2.2e-16
cor.test(total_females$total, total_females$year) 
## 
##  Pearson's product-moment correlation
## 
## data:  total_females$total and total_females$year
## t = 51.659, df = 47, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9845609 0.9951144
## sample estimates:
##       cor 
## 0.9913086
m_1967 <- total_males %>% # subset 1967 males
  filter(year == 1967)
m_2015 <- total_males %>% # subset 2015 males
  filter(year == 2015)
net_males <- m_2015$total - m_1967$total # net increase from 1967-2015
fold_males <- m_2015$total/m_1967$total # fold increase from 1967-2015
perc_males <- ((m_2015$total - m_1967$total)/m_1967$total) * 100 # % increase from 1967 - 2015


f_1967 <- total_females %>% # subset 1967 females
  filter(year == 1967)
f_2015 <- total_females %>% # subset 2015 females 
  filter(year == 2015)
net_females <- f_2015$total - f_1967$total # net increase from 1967-2015
fold_females <- f_2015$total/f_1967$total # fold increase from 1967-2015
perc_females <- ((f_2015$total - f_1967$total)/f_1967$total) * 100 # % increase from 1967 - 2015


# since 1967, there has been a dramatic rise in female graduate student enrollment in the U.S. as compared to males
# male enrollment has nearly doubled (1.9 fold), whereas female enrollment has seen a 6.5 fold increase
# net increase in males was 59,0865; net increase in females was 1,453,562

Problem 1 (Michael):

ge_adddecyear <- ge %>%
  mutate(year1 = year - min(year))

lm_gem1 <- lm(total_males ~ year1, data = ge_adddecyear)
summary(lm_gem1)
## 
## Call:
## lm(formula = total_males ~ year1, data = ge_adddecyear)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -96461 -34861 -12841  51876  95766 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   727162      15209   47.81   <2e-16 ***
## year1           9069        546   16.61   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 54050 on 47 degrees of freedom
## Multiple R-squared:  0.8545, Adjusted R-squared:  0.8514 
## F-statistic:   276 on 1 and 47 DF,  p-value: < 2.2e-16
lm_gef1 <- lm(total_females ~ year1, data = ge_adddecyear)
summary(lm_gef1)
## 
## Call:
## lm(formula = total_females ~ year1, data = ge_adddecyear)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -89397 -48101  -7633  45267 129727 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 302388.4    16245.4   18.61   <2e-16 ***
## year1        30126.0      583.2   51.66   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 57730 on 47 degrees of freedom
## Multiple R-squared:  0.9827, Adjusted R-squared:  0.9823 
## F-statistic:  2669 on 1 and 47 DF,  p-value: < 2.2e-16
# predict from model over the same time domain
ge_year <- data.frame(year1 = c(0:48))
ge_year$year <- c(1967:2015)
predict_m <- as.data.frame(predict(lm_gem1, newdata = ge_year, se.fit = TRUE, interval="confidence"))
predict_f <- as.data.frame(predict(lm_gef1, newdata = ge_year, se.fit = TRUE, interval="confidence"))
names(predict_m) <- paste(names(predict_m), "m", sep = "_")
names(predict_f) <- paste(names(predict_f), "f", sep = "_")

ge_fullpred <- cbind(ge_adddecyear, predict_m, predict_f)

#ge_predm <- cbind(ge_year, predict_m)
#ge_predf <- cbind(ge_year, predict_f)

# plot
ge_plot <- ggplot(ge_fullpred, aes(x = year, y = total_females)) +
  geom_ribbon(aes(ymin = fit.lwr_f, ymax = fit.upr_f), fill = "tomato3", alpha = 0.35) +
  geom_ribbon(aes(ymin = fit.lwr_m, ymax = fit.upr_m), fill = "steelblue3", alpha = 0.35) +
  geom_point(color = "tomato2") +
  geom_point(aes(x = year, y = total_males), color = "steelblue3") + 
  geom_line(aes(x = year, y = fit.fit_f), color = "tomato2") +
  geom_line(aes(x = year, y = fit.fit_m), color = "steelblue3") +
  scale_x_continuous(limits = c(1965, 2015), expand=c(0,0)) +
  scale_y_continuous(limits = c(-100000, 2100000), expand=c(0,0)) +
  labs(y = "Total enrollment", x = "Year") +
  theme(legend.position = "top") +
  theme_classic()

ge_plot

Problem 2 (Rachel):


pf_fem <- pf %>% 
  filter(field == "physical_science" | field == "engineering" | field == "education" | field == "humanities", sex == "F") %>%
  select(field, year, number) %>%
  spread(year, number) %>%
  select(-field)

#p-value < 0.05, there is a significant effect of year on number of female students in each field 

rownames(pf_fem) <- c("physical_science", "engineering", "education", "humanities")

pf_chisqfem <- chisq.test(pf_fem)
pf_chisqfem
 
phd_shifts <- pf_fem %>% 
  ggplot(aes(x = year, y = number)) +
  geom_col(aes(fill = field), position = "dodge") +
  scale_x_continuous(expand=c(0,0), breaks = seq(1985,2015, by=15)) +
  scale_y_continuous(expand=c(0,0)) +
  theme_classic() +
  labs(
    x = "Year",
    y = "Female PhDs Awarded"
  )
phd_shifts

#chi-square (need to work on)
#phd_chi <- chisq.test(pf_fem) 
#phd_chi

Problem 2 (Michael):

pf_f <- pf %>% 
  filter(field == "physical_science" | field == "engineering" | field == "education" | field == "humanities", sex == "F") %>%
  select(field, year, number) %>%
  spread(year, number) %>%
  select(-field)

rownames(pf_f) <- c("physical_science", "engineering", "education", "humanities")
## Warning: Setting row names on a tibble is deprecated.
pf_chisqf <- chisq.test(pf_f)
pf_chisqf
## 
##  Pearson's Chi-squared test
## 
## data:  pf_f
## X-squared = 2073.3, df = 6, p-value < 2.2e-16
pf_fplt <- pf %>% 
  filter(field == "physical_science" | field == "engineering" | field == "education" | field == "humanities", sex == "F") %>%
  select(field, year, number)
pf_fplt$year <- as.character(pf_fplt$year)

pf_plt <- ggplot(pf_fplt, aes(x = field, y = number)) +
  geom_col(aes(fill = year), position = "dodge", colour = "black") +
  theme_classic() +
  theme(legend.position = "top") +
  labs(
    x = "Field",
    y = "PhDs awarded to females"
  ) +
  scale_y_continuous(expand=c(0,0)) +
  scale_x_discrete(labels = c("education","engineering","humanities","physical science")) +
  scale_fill_brewer(palette = "Greys")
pf_plt

ggsave(filename = "pf_plt.png", plot = pf_plt,
  scale = 1, width = 7, height = 3, units = "in",
  dpi = 300)

Problem 3 (Rachel):


melt_ms <- melt(ms)

#absolute difference in medians
#Employed Male/Female
emp_m_median <- median(ms$employment_male) #75167
emp_f_median <- median(ms$employment_female) #71750
emp_m_median - emp_f_median #3417
#Postdoc Male/Female
post_m_median <- median(ms$postdoc_male) #4800
post_f_median <- median(ms$postdoc_female) #4500
post_m_median - post_f_median # 3000

hist_emp_m = ggplot(ms, aes(ms$employment_male)) + geom_histogram()
qq_emp_m = ggplot(ms, aes(sample = ms$employment_male)) + geom_qq()

hist_emp_f = ggplot(ms, aes(ms$employment_female)) + geom_histogram()
qq_emp_f = ggplot(ms, aes(sample = ms$employment_female)) + geom_qq()

hist_post_m = ggplot(ms, aes(ms$postdoc_male)) + geom_histogram()
qq_post_m = ggplot(ms, aes(sample = ms$postdoc_male)) + geom_qq()

hist_post_f = ggplot(ms, aes(ms$postdoc_female)) + geom_histogram()
qq_post_f = ggplot(ms, aes(sample = ms$postdoc_female)) + geom_qq()

ftest_emp = var.test(ms$employment_male,ms$employment_female) # p=.88 thus variances are equal
ftest_post = var.test(ms$postdoc_male,ms$postdoc_female) # p=.85 thus variances are equal

#Mann-Whitney U
mwu_emp = wilcox.test(ms$employment_male,ms$employment_female)
mwu_emp # p = 0.3289 (not significant, ranks are equal)

mwu_post = wilcox.test(ms$postdoc_male,ms$postdoc_female)
mwu_post # p = 0.8671 (not significant, ranks are equal)

effsize_emp = cohen.d(ms$employment_male,ms$employment_female) # d estimate = 0.26 (small)
effsize_post = cohen.d(ms$postdoc_male,ms$postdoc_female) # d estimate = 0.04 (negligible)

#even though absolute difference is not significant, males are still paid a higher salary than females

ms_emp <- ms %>% 
  select(field_of_study, employment_male, employment_female) %>% 
  rename(Male = employment_male, Female = employment_female) %>% 
  gather(key = sex, value = employment, 2:3)

ms_post <- ms %>% 
  select(field_of_study, employment_male, employment_female) %>% 
  rename(Male = employment_male, Female = employment_female) %>% 
  gather(key = sex, value = postdoc, 2:3)

ms_joined <- merge(ms_emp, ms_post, by = c('field_of_study', 'sex'))


View (ms_joined)

gender <- c("Female", "Male")
sex_color <- c("white", "gray60")

emp_med_plot <- ggplot(fs_joined, aes(x = sex, y = employment)) +
  geom_boxplot(aes(fill = sex)) +
  scale_fill_manual(values = sex_color) +
  theme_classic() +
  scale_x_discrete(labels = gender) +
  labs(x = "Sex", y = "Median Employed Doctoral Recipient Salary (dollars) (n = 30)") +
  theme(plot.title = element_text(hjust = 0.5))

emp_med_plot


post_med_plot <- ggplot(ms_joined, aes(x = sex, y = employment)) +
  geom_boxplot(aes(fill = sex)) +
  scale_fill_manual(values = sex_color) +
  theme_classic() +
  scale_x_discrete(labels = gender) +
  labs(x = "Sex", y = "Median Doctoral Recipient Starting Salary (dollars) (n = 30)") +
  theme(plot.title = element_text(hjust = 0.5))

post_med_plot

ms_plot <- ggplot(melt_ms, aes(x = variable, y = value)) +
  geom_boxplot() +
  labs(y = "Median Doctoral Recipient Starting Salary (dollars)", x = "") + 
  scale_color_manual(values = c("white", "gray60", "white", "gray60"),
  labels=c("Employed Males","Employed Females","Male Postdocs","Female Postdocs"), name = "Employment") +
  scale_x_discrete(labels=c("Employed Males","Employed Females","Male Postdocs","Female Postdocs")) +
  theme_classic()

ms_plot

Problem 3 (Jeremy):

hist_emp_m = ggplot(ms, aes(ms$employment_male)) + geom_histogram()
hist_emp_f = ggplot(ms, aes(ms$employment_female)) + geom_histogram()
hist_post_m = ggplot(ms, aes(ms$postdoc_male)) + geom_histogram()
hist_post_f = ggplot(ms, aes(ms$postdoc_female)) + geom_histogram()

ftest_emp = var.test(ms$employment_male,ms$employment_female) # p=.88 thus variances are equal
ftest_post = var.test(ms$postdoc_male,ms$postdoc_female) # p=.85 thus variances are equal

ttest_emp = t.test(ms$employment_male,ms$employment_female, var.equal = T)
ttest_emp
## 
##  Two Sample t-test
## 
## data:  ms$employment_male and ms$employment_female
## t = 0.72724, df = 28, p-value = 0.4731
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -9242.944 19418.544
## sample estimates:
## mean of x mean of y 
##   81297.8   76210.0
ttest_post = t.test(ms$postdoc_male,ms$postdoc_female, var.equal = T)
ttest_post
## 
##  Two Sample t-test
## 
## data:  ms$postdoc_male and ms$postdoc_female
## t = 0.11806, df = 28, p-value = 0.9069
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -5177.666  5810.999
## sample estimates:
## mean of x mean of y 
##  48850.00  48533.33

Problem 4 (Rachel):

# correlation of years since phd and years faculty service
yrs_corr <- fs %>% 
  ggplot(aes(x = years_since_PhD, y = years_faculty_service)) +
  geom_point(aes(color = sex), alpha = 0.5)
yrs_corr

#correlation years faculty service vs salary
service_corr <- fs %>% 
  ggplot(aes(x = years_faculty_service, y = salary)) +
  geom_point(aes(color = sex), alpha = 0.5)
service_corr

#filter by discipline -- should we run models for each discipline?
theoretical <- fs %>% 
  filter(discipline == "A")

applied <- fs %>% 
  filter(discipline == "B")

# Rank: reference level = profesor
fs$faculty_rank <- factor(fs$faculty_rank)
fs$faculty_rank <- fct_relevel(fs$faculty_rank, "Prof")
# everything model 
salary_lm1 <- lm(salary ~ faculty_rank + years_since_PhD + years_faculty_service + sex + discipline, data = fs)
plot(salary_lm1)

vif(salary_lm1)
##                           GVIF Df GVIF^(1/(2*Df))
## faculty_rank          2.013193  2        1.191163
## years_since_PhD       7.518936  1        2.742068
## years_faculty_service 5.923038  1        2.433729
## sex                   1.030805  1        1.015285
## discipline            1.064105  1        1.031555
# years since PhD = 7.5
# years faculty service = 5.9
# others less than 4
# years since phd and years faculty service is a bit redundant so maybe we don't need both?
salary_lm2 <- lm(salary ~ faculty_rank + years_faculty_service + sex + discipline, data = fs)
plot(salary_lm2)

vif(salary_lm2)
##                           GVIF Df GVIF^(1/(2*Df))
## faculty_rank          1.597441  2        1.124233
## years_faculty_service 1.627110  1        1.275582
## sex                   1.030803  1        1.015284
## discipline            1.029040  1        1.014416
#faculty rank, discipline, sex, years_since_PhD
salary_lm3 <- lm(salary ~ faculty_rank + discipline + sex + years_since_PhD, data = fs)
plot(salary_lm3)

vif(salary_lm3)
##                     GVIF Df GVIF^(1/(2*Df))
## faculty_rank    1.987205  2        1.187301
## discipline      1.055727  1        1.027486
## sex             1.028359  1        1.014080
## years_since_PhD 2.065517  1        1.437191
#sex, rank discipline
salary_lm4 <- lm(salary ~ faculty_rank + discipline + sex, data = fs)
plot(salary_lm4)

vif(salary_lm4)
##                  GVIF Df GVIF^(1/(2*Df))
## faculty_rank 1.034437  2        1.008500
## discipline   1.012236  1        1.006099
## sex          1.022339  1        1.011108
#rank, discipline, years_since_PhD
salary_lm5 <- lm(salary ~ faculty_rank + discipline + years_since_PhD, data = fs)
plot(salary_lm5)

vif(salary_lm5)
##                     GVIF Df GVIF^(1/(2*Df))
## faculty_rank    1.978933  2        1.186063
## discipline      1.054461  1        1.026869
## years_since_PhD 2.053425  1        1.432978
#AIC on all models
AIC(salary_lm1) #9094
## [1] 9093.826
AIC(salary_lm2) #9097
## [1] 9096.813
#lm1 has a lower AIC, but need to use judgmement do decide (look more into papers, think about whether service and years since phd are both needed)
AIC(salary_lm3) #9097
## [1] 9097.22
AIC(salary_lm4) #9095
## [1] 9095.454
AIC(salary_lm5) #9096
## [1] 9096.497
lm_table <- stargazer(salary_lm1, salary_lm2)
## 
## % Table created by stargazer v.5.2.2 by Marek Hlavac, Harvard University. E-mail: hlavac at fas.harvard.edu
## % Date and time: Mon, Dec 03, 2018 - 21:57:55
## \begin{table}[!htbp] \centering 
##   \caption{} 
##   \label{} 
## \begin{tabular}{@{\extracolsep{5pt}}lcc} 
## \\[-1.8ex]\hline 
## \hline \\[-1.8ex] 
##  & \multicolumn{2}{c}{\textit{Dependent variable:}} \\ 
## \cline{2-3} 
## \\[-1.8ex] & \multicolumn{2}{c}{salary} \\ 
## \\[-1.8ex] & (1) & (2)\\ 
## \hline \\[-1.8ex] 
##  faculty\_rankAssocProf & $-$32,158.410$^{***}$ & $-$34,599.240$^{***}$ \\ 
##   & (3,540.647) & (3,382.515) \\ 
##   & & \\ 
##  faculty\_rankAsstProf & $-$45,066.000$^{***}$ & $-$49,159.640$^{***}$ \\ 
##   & (4,237.523) & (3,834.485) \\ 
##   & & \\ 
##  years\_since\_PhD & 535.058$^{**}$ &  \\ 
##   & (240.994) &  \\ 
##   & & \\ 
##  years\_faculty\_service & $-$489.516$^{**}$ & $-$88.779 \\ 
##   & (211.938) & (111.639) \\ 
##   & & \\ 
##  sexMale & 4,783.493 & 4,771.248 \\ 
##   & (3,858.668) & (3,878.005) \\ 
##   & & \\ 
##  disciplineB & 14,417.630$^{***}$ & 13,473.380$^{***}$ \\ 
##   & (2,342.875) & (2,315.498) \\ 
##   & & \\ 
##  Constant & 111,021.200$^{***}$ & 117,511.300$^{***}$ \\ 
##   & (5,480.204) & (4,658.714) \\ 
##   & & \\ 
## \hline \\[-1.8ex] 
## Observations & 397 & 397 \\ 
## R$^{2}$ & 0.455 & 0.448 \\ 
## Adjusted R$^{2}$ & 0.446 & 0.441 \\ 
## Residual Std. Error & 22,538.650 (df = 390) & 22,651.610 (df = 391) \\ 
## F Statistic & 54.195$^{***}$ (df = 6; 390) & 63.411$^{***}$ (df = 5; 391) \\ 
## \hline 
## \hline \\[-1.8ex] 
## \textit{Note:}  & \multicolumn{2}{r}{$^{*}$p$<$0.1; $^{**}$p$<$0.05; $^{***}$p$<$0.01} \\ 
## \end{tabular} 
## \end{table}
lm_table
##  [1] ""                                                                                                           
##  [2] "% Table created by stargazer v.5.2.2 by Marek Hlavac, Harvard University. E-mail: hlavac at fas.harvard.edu"
##  [3] "% Date and time: Mon, Dec 03, 2018 - 21:57:55"                                                              
##  [4] "\\begin{table}[!htbp] \\centering "                                                                         
##  [5] "  \\caption{} "                                                                                             
##  [6] "  \\label{} "                                                                                               
##  [7] "\\begin{tabular}{@{\\extracolsep{5pt}}lcc} "                                                                
##  [8] "\\\\[-1.8ex]\\hline "                                                                                       
##  [9] "\\hline \\\\[-1.8ex] "                                                                                      
## [10] " & \\multicolumn{2}{c}{\\textit{Dependent variable:}} \\\\ "                                                
## [11] "\\cline{2-3} "                                                                                              
## [12] "\\\\[-1.8ex] & \\multicolumn{2}{c}{salary} \\\\ "                                                           
## [13] "\\\\[-1.8ex] & (1) & (2)\\\\ "                                                                              
## [14] "\\hline \\\\[-1.8ex] "                                                                                      
## [15] " faculty\\_rankAssocProf & $-$32,158.410$^{***}$ & $-$34,599.240$^{***}$ \\\\ "                             
## [16] "  & (3,540.647) & (3,382.515) \\\\ "                                                                        
## [17] "  & & \\\\ "                                                                                                
## [18] " faculty\\_rankAsstProf & $-$45,066.000$^{***}$ & $-$49,159.640$^{***}$ \\\\ "                              
## [19] "  & (4,237.523) & (3,834.485) \\\\ "                                                                        
## [20] "  & & \\\\ "                                                                                                
## [21] " years\\_since\\_PhD & 535.058$^{**}$ &  \\\\ "                                                             
## [22] "  & (240.994) &  \\\\ "                                                                                     
## [23] "  & & \\\\ "                                                                                                
## [24] " years\\_faculty\\_service & $-$489.516$^{**}$ & $-$88.779 \\\\ "                                           
## [25] "  & (211.938) & (111.639) \\\\ "                                                                            
## [26] "  & & \\\\ "                                                                                                
## [27] " sexMale & 4,783.493 & 4,771.248 \\\\ "                                                                     
## [28] "  & (3,858.668) & (3,878.005) \\\\ "                                                                        
## [29] "  & & \\\\ "                                                                                                
## [30] " disciplineB & 14,417.630$^{***}$ & 13,473.380$^{***}$ \\\\ "                                               
## [31] "  & (2,342.875) & (2,315.498) \\\\ "                                                                        
## [32] "  & & \\\\ "                                                                                                
## [33] " Constant & 111,021.200$^{***}$ & 117,511.300$^{***}$ \\\\ "                                                
## [34] "  & (5,480.204) & (4,658.714) \\\\ "                                                                        
## [35] "  & & \\\\ "                                                                                                
## [36] "\\hline \\\\[-1.8ex] "                                                                                      
## [37] "Observations & 397 & 397 \\\\ "                                                                             
## [38] "R$^{2}$ & 0.455 & 0.448 \\\\ "                                                                              
## [39] "Adjusted R$^{2}$ & 0.446 & 0.441 \\\\ "                                                                     
## [40] "Residual Std. Error & 22,538.650 (df = 390) & 22,651.610 (df = 391) \\\\ "                                  
## [41] "F Statistic & 54.195$^{***}$ (df = 6; 390) & 63.411$^{***}$ (df = 5; 391) \\\\ "                            
## [42] "\\hline "                                                                                                   
## [43] "\\hline \\\\[-1.8ex] "                                                                                      
## [44] "\\textit{Note:}  & \\multicolumn{2}{r}{$^{*}$p$<$0.1; $^{**}$p$<$0.05; $^{***}$p$<$0.01} \\\\ "             
## [45] "\\end{tabular} "                                                                                            
## [46] "\\end{table} "

Problem 4 (Michael):

fs_numonly <- fs %>%
  select(years_since_PhD, years_faculty_service, salary)
cor(fs_numonly)
##                       years_since_PhD years_faculty_service    salary
## years_since_PhD             1.0000000             0.9096491 0.4192311
## years_faculty_service       0.9096491             1.0000000 0.3347447
## salary                      0.4192311             0.3347447 1.0000000
# relevel faculty_rank to assistant prof
fs$faculty_rank <- factor(fs$faculty_rank)
fs$faculty_rank <- fct_relevel(fs$faculty_rank, "AsstProf")


# models
predsalary1 <- lm(salary ~ faculty_rank + discipline + years_since_PhD + years_faculty_service + sex, data = fs)
plot(predsalary1, which=c(2,1))

# the residuals do not look homoskedastic to me. this will effect predictive capacity, but not necessarily coefficients. 

fs_a <- fs %>%
  filter(discipline == "A") 
fs_b <- fs %>%
  filter(discipline == "B")

predsalary_a <- lm(salary ~ faculty_rank + years_since_PhD + years_faculty_service + sex, data = fs_a) # only theoretical
plot(predsalary_a, which=c(2,1))

AIC(predsalary_a)
## [1] 4156.074
predsalary_b <- lm(salary ~ faculty_rank + years_since_PhD + years_faculty_service + sex, data = fs_b) # only applied
plot(predsalary_b, which=c(2,1))

AIC(predsalary_b)
## [1] 4924.781
# heteroskedasticity seems to come mostly from applied faculty. Have two different models? 
# now take years since phd out

predsalary_a_simp1 <- lm(salary ~ faculty_rank + years_faculty_service + sex, data = fs_a) # only theoretical
plot(predsalary_a_simp1, which=c(2,1))

AIC(predsalary_a_simp1)
## [1] 4170.93
predsalary_b_simp1 <- lm(salary ~ faculty_rank + years_faculty_service + sex, data = fs_b) # only applied
plot(predsalary_b_simp1, which=c(2,1))

AIC(predsalary_b_simp1)
## [1] 4923.253
# AIC is larger for theoretical, only slightly smaller for applied.
# I think it is best to run different models for applied and theoretical, as their variances are different and they respond differently to years_since_phd and years_faculty_service

Problem 4 (Jeremy):